source("helper_functions.R")

ClWildBS.adj.se <- TRUE                    # Compute adjusted SE for WildBS?

## Which rows in the table to compute by default
ClDefault.se <- ConvertSETypes(list("homo"="N",
                   "HC0"=c("N","t","wild","wild0"),
                   "HC1"=c("N","t"),
                   "HC2"=c("N","t","wild","wild0","Opt","BM","IK")))

## Set slope and intercept to zero
beta0 <- alpha0 <- 0

## DGP in simulations with clustering
ClDGP <- function(design, nsim) {
    S <- switch(design, "2"=5, "6"=, "7"=, "8"=, "9"=, "10"=50, 10)
    NS <- switch(design, "3" = c(10, 50),
                 "6" =, "7" =, "8" =, "9" =, "10" = 6, 30)
    n <- mean(S*NS)

    ## vector of group memberships
    group <- if(length(NS)==1) {
        rep(seq(S), 1, each=NS)
    } else {
        c(rep(1:(S/2), 1, each=NS[1]), rep( (S/2+1):S, 1, each=NS[2]))
    }

    sd.eta <- function(X) switch(design, "4"=3*abs(X), 1)
    sd.nu <- function(V) switch(design, "8"=1+V, "9"=1, "10"=2-V, 1)
    V <- matrix(switch(design, "5" = sqrt(2)*rnorm(S*nsim),
                       "7" = exp(rnorm(S*nsim)),
                       "8"=, "9"=, "10"=seq(S)<=3, rnorm(S*nsim)),
                nrow=S, ncol=nsim)
    W <- matrix(switch(design, "5"=, "7"=, "8"=, "9"=, "10"=0, rnorm(n*nsim)),
                nrow=n, ncol=nsim)
    X <- V[group, ] + W
    Y <- alpha0+X*beta0 + sd.eta(X)*matrix(rnorm(n*nsim), nrow=n) +
        (sd.nu(V)*matrix(rnorm(S*nsim), nrow=S))[group,]
    Z <- outer(group, unique(group), "==") # Matrix of group indicators

    list(Z=Z, X=X, Y=Y, sd.eta=sd.eta, Z.se=sd.nu(V[,1])*Z)
}

## Rescale asymptotic variance matrix in linear regression with clustering
ClScale <- function(X, H, Z, Vhat) {
    if (Vhat=="HC0") {
        return(X)
    } else if (Vhat=="HC1") {
        ## (N-1)/(N-1) * S/(S-1)
        return(sqrt(nrow(X)/(nrow(X)-ncol(X))*(ncol(Z)/(ncol(Z)-1)))*X)
    } else if (Vhat=="homo") {
        return(NA)
    } else {
        ## HC2
        r <- lapply(seq(ncol(Z)), function(j)
            MatSqrtInverse(diag(sum(Z[, j]))-H[Z[, j], Z[, j]]) %*% X[Z[, j], ])
        return(do.call(rbind, r))
    }
}

## Compute various statistics based on X
ClComputeXFunctions <- function(X, Z, ell, compute.G=FALSE) {
    XX <- crossprod(X)
    Xinv <- solve(XX, ell)
    H <- X %*% solve(XX,t(X))           # Hat matrix
    Xs <- lapply(c(HC0="HC0", HC1="HC1", HC2="HC2"),
                 function(Vhat) ClScale(X, H, Z, Vhat))
    G <- if (compute.G==TRUE) {
        apply(Z, 2, function(z)
            (diag(nrow(Z))-H)[,z] %*%  Xs$HC2[z,] %*% solve(XX, ell))
    } else {
        NA
    }

    list(Xs=Xs, G=G, Xinv=Xinv)
}

ClSE <- function(Vhat, Xf, u, Z, ell)
    if (Vhat=="homo") {
        return(sqrt(mean(u^2) * ell %*% Xf$Xinv))
    } else {
        return(sqrt(Xf$Xinv %*%
                        crossprod(crossprod(Z, u*Xf$Xs[[Vhat]])) %*% Xf$Xinv))
    }

ClSEs <- function(Y, X, Z, ell, Vhat, dof, beta0, sig.i=NULL,
                       Z.se=NULL, Xf=NULL) {
    ## Compute cluster robust standard errors

    n <- nrow(X)
    ols <- lm(Y~0+X)
    u <- ols$residuals

    if (is.null(Xf))
        Xf <- ClComputeXFunctions(X, Z, ell, compute.G=TRUE)

    se <- sapply(Vhat, ClSE, Xf, u, Z, ell)

    DFadjust <- function(sig.i, Z.se){
        Gm <- if(!is.null(Z.se)) crossprod(crossprod(Z.se, Xf$G)) else 0
        lambda <- eigen(crossprod(Xf$G, sig.i*Xf$G)+Gm,
                        only.values = TRUE)$values
        sum(lambda)^2/sum(lambda^2)
    }

    Moulton <- function() {
        ## Moulton estimates
        ssr <- sum(u^2)
        sigmu <- max((sum(crossprod(Z,u)^2)-ssr)/(sum(colSums(Z)^2)-n), 0)

        list(sig.i=max(ssr/n - sigmu, 0), Z.se=sqrt(sigmu)*Z)
    }

    HatDF <- function(df)
        switch(df,
               wild=,wild0=,N=Inf,
               t=ncol(Z)-1,
               Opt=DFadjust(sig.i, Z.se),
               BM=DFadjust(1,NULL),
               IK=do.call(DFadjust, Moulton()))

    hat.df <- unname(sapply(dof, function(r) Re(HatDF(r))))

    cv <- qt(0.975,df=hat.df)
    adj.t <- unname(abs(sum(ols$coefficients*ell)-beta0)/se*1.96/cv)
    adj.se <- unname(se*cv/qnorm(0.975))

    wild.ind <- dof=="wild" | dof=="wild0"
    if(sum(wild.ind)>0)
        for (i in which(wild.ind)) {
            r <- ClWildBS(Y, X, Z, ell, Vhat[i], dof[i], Xf=Xf, beta0=beta0,
                          adj.se=ClWildBS.adj.se)
            adj.t[i] <- r$adj.t
            adj.se[i] <- r$adj.se
        }
    list(adj.t=adj.t, adj.se=adj.se, df=hat.df)
}

ClSize <- function(nsim, nsimb, se.types, design) {
    ## Compute test size and standard erorrs in the designs with clustering

    set.seed(42)

    d <- ClDGP(design, nsim)              # generate data
    ell <- c(1,0)                       # do inference on first element
    n.cols <- nrow(se.types)
    df <- adj.se <- adj.t <- matrix(nrow=nsim, ncol=n.cols)

    for (j in seq(nsim)) {
        ## don't compute wild bootstrap if j>nsimb
        subset <- if (j>nsimb)
                      !grepl("wild", se.types$dof) else rep(TRUE, n.cols)

        r <- ClSEs(d$Y[,j], cbind(d$X[,j], 1), d$Z, ell,
                        se.types$Vhat[subset], se.types$dof[subset],
                        beta0=beta0, sig.i=d$sd.eta(d$X[,j])^2, Z.se=d$Z.se)
        adj.t[j,subset] <- r$adj.t
        adj.se[j,subset] <- r$adj.se
        df[j,subset] <- r$df
    }

    ret <- cbind(100*apply(adj.t<1.96, 2, mean, na.rm=TRUE),
                 apply(adj.se, 2, median, na.rm=TRUE),
                 apply(adj.se, 2, quantile, probs=0.05, na.rm=TRUE),
                 apply(adj.se, 2, quantile, probs=0.90, na.rm=TRUE),
                 apply(adj.se, 2, quantile, probs=0.95, na.rm=TRUE),
                 apply(df, 2, mean, na.rm=TRUE),
                 apply(df, 2, sd, na.rm=TRUE))
    colnames(ret) <- c("Cov", "MedL", "5%L", "90%", "95%L", "mdof", "vardof")

    cbind(se.types, ret)
}

ClSimulation <- function(nsim, nsimb, se.types=ClDefault.se,
                              dc=as.character(seq(10)), printDetail=FALSE) {

    l <- lapply(dc, function(r)
        format(ClSize(nsim, nsimb, se.types, r), digits=2, nsmall=1))

    if(printDetail==TRUE) print(l)

    s <- do.call(cbind,l)
    list(cis=cbind(se.types,s[,names(s)=="Cov" | names(s) == "MedL"]),
         dof=cbind(se.types,s[,names(s)=="mdof"]))
}

ClWildBS <- function(Y, X, Z, ell, Vhat, dof, Xf, beta0,
                     nboot=499, adj.se=TRUE) {
    S <- ncol(Z)
    ols <- lm(Y~0+X)
    beta <- ols$coefficients
    se <- ClSEs(Y, X, Z, ell, Vhat, "N", beta0, Xf=Xf)$adj.se

    ## Draw Rademacher random variables
    vm <- matrix(sample(c(-1,1), S*nboot, replace=TRUE), nrow=S)

    ## Wild Bootstrap test of beta=b0
    Testb0 <- function(b0) {
        Yhat <- mean(Y-X[, 1]*b0) + X[, 1]*b0
        Ym <- if(dof=="wild0")  {
            Yhat + ((Y-Yhat)*Z) %*% vm
        } else {
            drop(X %*% beta)+(ols$residuals*Z) %*% vm
        }

        betam <- solve(crossprod(X), crossprod(X, Ym))
        um <- Ym-X %*% betam
        sem <- sapply(seq(nboot), function(m) ClSE(Vhat, Xf, um[, m], Z, ell))
        tm <- if (dof=="wild0") (betam[1,]-b0)/sem else (betam[1,]-sum(beta*ell))/sem

        t.data <- (sum(beta*ell)-b0)/se
        cv <- quantile(abs(tm), 0.95)
        return(list(adj.t=abs(t.data)*1.96/cv,
                    adj.se=cv*se/qnorm(0.975), cv=cv))
    }

    if (dof=="wild0") {
        ## Assume X=c(W,1) and we're testing the slope
        x <- if (!is.null(adj.se)) {
                 FindMinInterval(function(b0) Testb0(b0)$adj.t>1.96,
                                 beta0+c(-5,5), tol=0.01)
             } else {
                 c(0,0)
             }

        return(list(adj.t=Testb0(beta0)$adj.t,
                    adj.se=((x[2]-x[1]) /(2*qnorm(0.975)))))
    } else {
        return(Testb0(beta0))
    }
}
